rmarkdown::render(‘./1_JP_scripts/1_QC_filtering_metrics_HARDAC.Rmd’)
Changes in myeloid and kidney cells after renal ischemia - Analysis of 2 x 10X scRNA-seq samples from 2 pools of WT (3 Sham + 3 CLP): comparison of gene expression in sham vs CLP in different cell populations
# Load the Cell Ranger Matrix Data and create the base Seurat
# object. we initialize the Seurat object
# (`CreateSeuratObject`) with the raw (non-normalized data).
library(Seurat)
dataset_loc <- "./rawData/"
ids <- c("C0", "C24")
d10x.data <- sapply(ids, function(i) {
d10x <- Read10X(file.path(dataset_loc, i, "filtered_feature_bc_matrix/"))
colnames(d10x) <- paste(sapply(strsplit(colnames(d10x), split = "-"),
"[[", 1L), i, sep = "--")
d10x
})
experiment.data <- do.call("cbind", d10x.data)
experiment.aggregate <- CreateSeuratObject(experiment.data, project = "ShamCLP-201008",
names.field = 2, names.delim = "\\-\\-")
experiment.aggregate
## An object of class Seurat
## 32285 features across 25133 samples within 1 assay
## Active assay: RNA (32285 features, 0 variable features)
Calculate percent UMIs mapping to the mitochondrial genome per cell
experiment.aggregate[["percent.mito"]] <- PercentageFeatureSet(experiment.aggregate,
pattern = "^mt-")
summary(experiment.aggregate[["percent.mito"]])
## percent.mito
## Min. : 0.3163
## 1st Qu.:16.7708
## Median :35.5812
## Mean :41.0215
## 3rd Qu.:59.7707
## Max. :99.0097
The original samples names (the names above in ids) can be found in the metadata slot, column orig.ident.
samplename = experiment.aggregate@meta.data$orig.ident
table(samplename)
## samplename
## C0 C24
## 14408 10725
sample.id = rep("C0", length(samplename))
sample.id[samplename %in% c("C24")] = "C24"
names(sample.id) = rownames(experiment.aggregate@meta.data)
experiment.aggregate <- AddMetaData(object = experiment.aggregate,
metadata = sample.id, col.name = "sample.id")
table(sample.id)
## sample.id
## C0 C24
## 14408 10725
outdir <- "./processedData/1_JP_analyses_results/Rerun_HARDAC_20210216/1_QC_filtering_metrics"
dir.create(outdir, recursive = T)
colors <- c("#12999E", "#FDA908")
names(colors) <- levels(as.factor(sample.id))
v <- VlnPlot(experiment.aggregate, features = c("nFeature_RNA",
"nCount_RNA", "percent.mito"), ncol = 3, cols = colors, group.by = "sample.id")
v
pdf(paste0(outdir, "/1_VlnPlots_nFeature_RNA_nCount_RNA_percent.mito_per_sample_before_filtering.pdf"),
width = 21, height = 9)
v
dev.off()
## png
## 2
f <- FeatureScatter(experiment.aggregate, feature1 = "nCount_RNA",
feature2 = "percent.mito", cols = colors, group.by = "sample.id")
f
pdf(paste0(outdir, "/2_FeatureScatter_percent.mito_vs_nCount_RNA_per_sample_before_filtering.pdf"))
f
dev.off()
## png
## 2
f <- FeatureScatter(experiment.aggregate, feature1 = "nCount_RNA",
feature2 = "nFeature_RNA", cols = colors, group.by = "sample.id")
f
pdf(paste0(outdir, "/3_FeatureScatter_nFeature_RNA_vs_nCount_RNA_per_sample_before_filtering.pdf"))
f
dev.off()
## png
## 2
hist(experiment.aggregate$nCount_RNA, xlab = "Library sizes",
breaks = 100, col = "grey80", ylab = "Number of cells", main = "Distribution of library sizes
(= total number of transcripts per cell = $nCount_RNA)",
cex.main = 1)
abline(v = 2000, col = "red")
pdf(paste0(outdir, "/4_Library_size.pdf"), width = 8, height = 9)
hist(experiment.aggregate$nCount_RNA, xlab = "Library sizes",
breaks = 100, col = "grey80", ylab = "Number of cells", main = "Distribution of library sizes
(= total number of transcripts per cell = $nCount_RNA)",
cex.main = 1)
abline(v = 2000, col = "red")
dev.off()
## png
## 2
saveRDS(experiment.aggregate, paste0(outdir, "/5.experiment.aggregate.158.rds"))
# experiment.aggregate <-
# readRDS('./processedData/1_QC_filtering_metrics/5.experiment.aggregate.158.rds'')
experiment.aggregate.sce <- as.SingleCellExperiment(experiment.aggregate)
experiment.aggregate.sce
## class: SingleCellExperiment
## dim: 32285 25133
## metadata(0):
## assays(2): counts logcounts
## rownames(32285): Xkr4 Gm1992 ... AC234645.1 AC149090.1
## rowData names(0):
## colnames(25133): AAACCCAAGAGGGTGG--C0 AAACCCAAGATGGCGT--C0 ...
## TTTGTTGTCTAAACGC--C24 TTTGTTGTCTCGACCT--C24
## colData names(6): orig.ident nCount_RNA ... sample.id ident
## reducedDimNames(0):
## altExpNames(0):
library(scater)
per.cell <- perCellQCMetrics(experiment.aggregate.sce, subsets = list(Mito = grep("^mt-",
rownames(experiment.aggregate.sce))))
summary(per.cell$sum)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 500 2753 5354 7639 10682 63817
summary(experiment.aggregate$nCount_RNA)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 500 2753 5354 7639 10682 63817
summary(per.cell$detected)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 70 566 1192 1497 2200 7841
summary(per.cell$subsets_Mito_percent)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.3163 16.7708 35.5812 41.0215 59.7707 99.0097
colData(experiment.aggregate.sce) <- cbind(colData(experiment.aggregate.sce),
per.cell)
p <- plotColData(experiment.aggregate.sce, x = "sum", y = "detected",
colour_by = "sample.id")
p
pdf(paste0(outdir, "/6_plotColData_detected_vs_sum.pdf"), width = 9,
height = 8)
p
dev.off()
## png
## 2
p <- plotColData(experiment.aggregate.sce, x = "sum", y = "subsets_Mito_percent",
other_fields = "sample.id") + facet_wrap(~sample.id)
p
pdf(paste0(outdir, "/7_plotColData_subsets_Mito_percent_vs_sum.pdf"),
width = 15, height = 9)
p
dev.off()
## png
## 2
qc.stats <- quickPerCellQC(per.cell, percent_subsets = "subsets_Mito_percent")
colData(experiment.aggregate.sce) <- cbind(colData(experiment.aggregate.sce),
qc.stats)
colSums(as.matrix(qc.stats))
## low_lib_size low_n_features high_subsets_Mito_percent
## 0 0 0
## discard
## 0
experiment.aggregate.sce <- runColDataPCA(experiment.aggregate.sce,
variables = c("sum", "detected", "subsets_Mito_percent"))
p <- plotReducedDim(experiment.aggregate.sce, dimred = "PCA_coldata",
colour_by = "discard", size_by = "subsets_Mito_percent",
percentVar = 100 * attr(reducedDim(experiment.aggregate.sce),
"percentVar")) + ggtitle("PCA based on QC metrics / Detected outlier cells are highlighted") +
theme(plot.title = element_text(size = 7), legend.title = element_text(size = 7),
legend.text = element_text(size = 7))
p
pdf(paste0(outdir, "/8_plotReducedDim_QC_metrics.pdf"), width = 9,
height = 9)
p
dev.off()
## png
## 2
Now we can define a cell filter based on our previous analysis:
filter_by_expr_features <- (experiment.aggregate.sce$detected >=
200 & experiment.aggregate.sce$detected <= 9000)
table(filter_by_expr_features)
## filter_by_expr_features
## FALSE TRUE
## 1388 23745
filter_by_total_counts <- (experiment.aggregate.sce$sum >= 500 &
experiment.aggregate.sce$sum < 150000)
table(filter_by_total_counts)
## filter_by_total_counts
## TRUE
## 25133
filter_by_MT <- experiment.aggregate.sce$subsets_Mito_percent <=
55
table(filter_by_MT)
## filter_by_MT
## FALSE TRUE
## 7067 18066
# filtered <- experiment.aggregate.sce[,!qc.stats$discard]
experiment.aggregate.sce$use <- (
# sufficient features (genes)
filter_by_expr_features &
# sufficient molecules counted
filter_by_total_counts &
# remove cells with unusual number of reads in MT genes
filter_by_MT
)
table(experiment.aggregate.sce$use)
##
## FALSE TRUE
## 7078 18055
experiment.aggregate.sce$discard <- !experiment.aggregate.sce$use
table(experiment.aggregate.sce$discard)
##
## FALSE TRUE
## 18055 7078
experiment.aggregate.sce <- runColDataPCA(experiment.aggregate.sce,
variables = c("sum", "detected", "subsets_Mito_percent"))
p <- plotReducedDim(experiment.aggregate.sce, dimred = "PCA_coldata",
colour_by = "discard", size_by = "subsets_Mito_percent",
percentVar = 100 * attr(reducedDim(experiment.aggregate.sce),
"percentVar")) + ggtitle("PCA based on QC metrics / Manual outlier cells are highlighted") +
theme(plot.title = element_text(size = 7), legend.title = element_text(size = 7),
legend.text = element_text(size = 7))
p
pdf(paste0(outdir, "/9_plotReducedDim_QC_metrics_manual_filtering.pdf"),
width = 9, height = 9)
p
dev.off()
## png
## 2
filtered <- experiment.aggregate.sce[, experiment.aggregate.sce$use]
p <- plotColData(filtered, x = "sum", y = "subsets_Mito_percent",
other_fields = "sample.id") + facet_wrap(~sample.id)
p
pdf(paste0(outdir, "/10_plotColData_subsets_Mito_percent_vs_sum_post_filtering.pdf"),
width = 15, height = 9)
p
dev.off()
## png
## 2
dim(filtered)
## [1] 32285 18055
table(filtered$sample.id)
##
## C0 C24
## 11131 6924
summary(filtered$detected)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 206.0 836.5 1538.0 1792.9 2576.0 7841.0
summary(filtered$subsets_Mito_percent)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.3163 11.3079 25.7262 25.6500 38.2506 54.9932
per.feat <- perFeatureQCMetrics(experiment.aggregate.sce)
summary(per.feat$mean)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0009 0.2366 0.0480 579.3806
summary(per.feat$detected)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.07958 4.63628 4.00668 100.00000
ave <- calculateAverage(experiment.aggregate.sce)
summary(ave)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0011 0.2366 0.0525 573.4458
summary(nexprs(experiment.aggregate.sce, byrow = TRUE))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 20 1165 1007 25133
plotHighestExprs(experiment.aggregate.sce, exprs_values = "counts")
pdf(paste0(outdir, "/11_plotHighestExprs.pdf"), width = 7, height = 7)
plotHighestExprs(experiment.aggregate.sce, exprs_values = "counts")
dev.off()
## png
## 2
keep_feature <- nexprs(experiment.aggregate.sce, byrow = TRUE) >
0
filtered <- experiment.aggregate.sce[keep_feature, experiment.aggregate.sce$use]
dim(filtered)
## [1] 22399 18055
We can investigate the relative importance of different explanatory factors with the plotExplanatoryVariables function. This function computes the percentage of variance in gene expression that is explained by variables in the sample-level metadata. It allows problematic factors to be quickly identified, as well as the genes that are most affected. This is best done on the log-expression values to reduce the effect of the mean on the variance - hence, we run normalize first. This can take a while to calculate and plot.
experiment.aggregate.sce <- logNormCounts(experiment.aggregate.sce)
vars <- getVarianceExplained(experiment.aggregate.sce, variables = c("sample.id",
"sum", "detected", "subsets_Mito_percent"))
head(vars)
## sample.id sum detected subsets_Mito_percent
## Xkr4 0.006943212 0.0003875252 0.010707344 0.010367705
## Gm1992 NaN NaN NaN NaN
## Gm19938 0.006475890 0.0036193873 0.009556241 0.004387266
## Gm37381 0.008258246 0.0574859723 0.084895733 0.001749579
## Rp1 0.114455198 0.0838337212 0.198380080 0.044357580
## Sox17 0.046244971 0.3358676296 0.105142397 2.814385055
Plot explanatory variables ordered by percentage of variance explained
plotExplanatoryVariables(vars)
filtered <- logNormCounts(filtered)
vars <- getVarianceExplained(filtered, variables = c("sample.id",
"sum", "detected", "subsets_Mito_percent"))
head(vars)
## sample.id sum detected subsets_Mito_percent
## Xkr4 0.007104581 0.0002976112 5.734868e-03 0.0024295009
## Gm19938 0.006728322 0.0036912228 5.271952e-03 0.0002947912
## Gm37381 0.009610374 0.0630397123 9.446692e-02 0.0010401689
## Rp1 0.114915087 0.0836476322 1.610593e-01 0.0005540464
## Sox17 0.167635588 0.4421780293 1.115945e-02 3.5530867920
## Gm37587 0.007028329 0.0136398415 3.062177e-05 0.0583494803
plotExplanatoryVariables(vars)
filtered <- experiment.aggregate[keep_feature, experiment.aggregate.sce$use]
dim(filtered)
## [1] 22399 18055
v <- VlnPlot(filtered, features = c("nFeature_RNA", "nCount_RNA",
"percent.mito"), ncol = 3, cols = colors, group.by = "sample.id")
v
pdf(paste0(outdir, "/12_VlnPlots_nFeature_RNA_nCount_RNA_percent.mito_per_sample_post_filtering.pdf"),
width = 21, height = 9)
v
dev.off()
## png
## 2
f <- FeatureScatter(filtered, feature1 = "nCount_RNA", feature2 = "percent.mito",
cols = colors, group.by = "sample.id")
f
pdf(paste0(outdir, "/13_FeatureScatter_percent.mito_vs_nCount_RNA_per_sample_post_filtering.pdf"))
f
dev.off()
## png
## 2
f <- FeatureScatter(filtered, feature1 = "nCount_RNA", feature2 = "nFeature_RNA",
cols = colors, group.by = "sample.id")
f
pdf(paste0(outdir, "/14_FeatureScatter_nFeature_RNA_vs_nCount_RNA_per_sample_post_filtering.pdf"))
f
dev.off()
## png
## 2
saveRDS(filtered, paste0(outdir, "/15.filtered.398.rds"))
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: Red Hat Enterprise Linux Server release 6.8 (Santiago)
##
## Matrix products: default
## BLAS: /gpfs/fs1/data/omicscore/Privratsky-Privratsky-20210215/scripts/conda/envs/privratsky/lib/libblas.so.3.8.0
## LAPACK: /gpfs/fs1/data/omicscore/Privratsky-Privratsky-20210215/scripts/conda/envs/privratsky/lib/liblapack.so.3.8.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] scater_1.18.0 ggplot2_3.3.3
## [3] SingleCellExperiment_1.12.0 SummarizedExperiment_1.20.0
## [5] Biobase_2.50.0 GenomicRanges_1.42.0
## [7] GenomeInfoDb_1.26.0 IRanges_2.24.0
## [9] S4Vectors_0.28.0 BiocGenerics_0.36.0
## [11] MatrixGenerics_1.2.0 matrixStats_0.58.0
## [13] SeuratObject_4.0.0 Seurat_4.0.0
##
## loaded via a namespace (and not attached):
## [1] ggbeeswarm_0.6.0 Rtsne_0.15
## [3] colorspace_2.0-0 deldir_0.2-9
## [5] ellipsis_0.3.1 ggridges_0.5.3
## [7] scuttle_1.0.0 XVector_0.30.0
## [9] BiocNeighbors_1.8.0 spatstat.data_2.0-0
## [11] farver_2.0.3 leiden_0.3.7
## [13] listenv_0.8.0 ggrepel_0.9.1
## [15] sparseMatrixStats_1.2.0 codetools_0.2-18
## [17] splines_4.0.3 knitr_1.31
## [19] polyclip_1.10-0 jsonlite_1.7.2
## [21] ica_1.0-2 cluster_2.1.1
## [23] png_0.1-7 uwot_0.1.10
## [25] shiny_1.6.0 sctransform_0.3.2
## [27] compiler_4.0.3 httr_1.4.2
## [29] Matrix_1.3-2 fastmap_1.1.0
## [31] lazyeval_0.2.2 formatR_1.7
## [33] BiocSingular_1.6.0 later_1.1.0.1
## [35] htmltools_0.5.1.1 tools_4.0.3
## [37] rsvd_1.0.3 igraph_1.2.6
## [39] gtable_0.3.0 glue_1.4.2
## [41] GenomeInfoDbData_1.2.4 RANN_2.6.1
## [43] reshape2_1.4.4 dplyr_1.0.4
## [45] Rcpp_1.0.6 spatstat_1.64-1
## [47] scattermore_0.7 vctrs_0.3.6
## [49] nlme_3.1-152 DelayedMatrixStats_1.12.0
## [51] lmtest_0.9-38 xfun_0.20
## [53] stringr_1.4.0 globals_0.14.0
## [55] beachmat_2.6.0 mime_0.10
## [57] miniUI_0.1.1.1 lifecycle_1.0.0
## [59] irlba_2.3.3 goftest_1.2-2
## [61] future_1.21.0 MASS_7.3-53.1
## [63] zlibbioc_1.36.0 zoo_1.8-8
## [65] scales_1.1.1 promises_1.2.0.1
## [67] spatstat.utils_2.0-0 RColorBrewer_1.1-2
## [69] yaml_2.2.1 reticulate_1.18
## [71] pbapply_1.4-3 gridExtra_2.3
## [73] rpart_4.1-15 stringi_1.5.3
## [75] highr_0.8 BiocParallel_1.24.0
## [77] rlang_0.4.10 pkgconfig_2.0.3
## [79] bitops_1.0-6 evaluate_0.14
## [81] lattice_0.20-41 ROCR_1.0-11
## [83] purrr_0.3.4 tensor_1.5
## [85] labeling_0.4.2 patchwork_1.1.1
## [87] htmlwidgets_1.5.3 cowplot_1.1.1
## [89] tidyselect_1.1.0 parallelly_1.23.0
## [91] RcppAnnoy_0.0.18 plyr_1.8.6
## [93] magrittr_2.0.1 R6_2.5.0
## [95] generics_0.1.0 DelayedArray_0.16.0
## [97] withr_2.4.1 pillar_1.4.7
## [99] mgcv_1.8-33 fitdistrplus_1.1-3
## [101] survival_3.2-7 abind_1.4-5
## [103] RCurl_1.98-1.2 tibble_3.0.6
## [105] future.apply_1.7.0 crayon_1.4.1
## [107] KernSmooth_2.23-18 plotly_4.9.3
## [109] rmarkdown_2.6 viridis_0.5.1
## [111] grid_4.0.3 data.table_1.13.6
## [113] digest_0.6.27 xtable_1.8-4
## [115] tidyr_1.1.2 httpuv_1.5.5
## [117] munsell_0.5.0 beeswarm_0.2.3
## [119] viridisLite_0.3.0 vipor_0.4.5
writeLines(capture.output(sessionInfo()), "./scripts/1_JP_scripts/1_QC_filtering_metrics_HARDAC.sessionInfo.txt")